{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "# Apply the gin bindings found in the operative config file.\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "# Path of the SentencePiece model file; also passed to the task registration later in the notebook.\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "# Load the model; the value returned by Load is what this cell displays.\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Location of the news JSON file. The original absolute path remains the default;\n",
    "# set the NEWS_JSON_PATH environment variable to read the file from somewhere else.\n",
    "news_json_path = os.environ.get('NEWS_JSON_PATH', '/home/husein/news/populate-news.json')\n",
    "\n",
    "# Name the encoding explicitly so decoding does not depend on the locale default.\n",
    "with open(news_json_path, encoding = 'utf-8') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['text', 'title'])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the top-level keys of the loaded JSON object.\n",
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Whitespace-separated word count of the first title\n",
    "# (the TSV export in the next cell keeps only titles with more than 3 words).\n",
    "len(data['title'][0].split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export (article text, title) pairs, one tab-separated row per line.\n",
    "# cleaning() replaces tabs and newlines in both fields, so neither can break the row layout.\n",
    "with tf.io.gfile.GFile('newstitle.tsv', \"w\") as outfile:\n",
    "    for i, raw_title in enumerate(data['title']):\n",
    "        c = cleaning(data['text'][i])\n",
    "        # Skip rows whose title has 3 words or fewer.\n",
    "        if len(raw_title.split()) <= 3:\n",
    "            continue\n",
    "        # Skip rows whose cleaned text is 20 characters or shorter.\n",
    "        if len(c) <= 20:\n",
    "            continue\n",
    "        # Skip rows whose text contains either marker phrase.\n",
    "        if 'javascript' in c or 'register in order to view' in c:\n",
    "            continue\n",
    "        outfile.write(\"%s\\t%s\\n\" % (c, cleaning(raw_title)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def news_dataset(split, shuffle_files = False):\n",
    "    \"\"\"Read newstitle.tsv as a dataset of {'question', 'answer'} string pairs.\n",
    "\n",
    "    Neither argument affects the result: `split` is never read and\n",
    "    `shuffle_files` is discarded immediately.\n",
    "    \"\"\"\n",
    "    del shuffle_files\n",
    "\n",
    "    # Each line holds two tab-separated string columns; quote characters are kept literally.\n",
    "    parse_line = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults = ['', ''],\n",
    "        field_delim = '\\t',\n",
    "        use_quote_delim = False,\n",
    "    )\n",
    "\n",
    "    def name_columns(*columns):\n",
    "        # Pair the parsed columns with their feature names, in file order.\n",
    "        return dict(zip(['question', 'answer'], columns))\n",
    "\n",
    "    lines = tf.data.TextLineDataset(['newstitle.tsv'])\n",
    "    rows = lines.map(parse_line, num_parallel_calls = tf.data.experimental.AUTOTUNE)\n",
    "    return rows.map(name_columns)\n",
    "\n",
    "def news_preprocessor(ds):\n",
    "    \"\"\"Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.\n",
    "\n",
    "    `inputs` is the question text prefixed with 'tajuk: '; `targets` is the\n",
    "    answer text unchanged.\n",
    "    \"\"\"\n",
    "    def add_task_prefix(example):\n",
    "        prefixed = tf.strings.join(['tajuk: ', example['question']])\n",
    "        return {'inputs': prefixed, 'targets': example['answer']}\n",
    "\n",
    "    return ds.map(add_task_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove any task already registered under this name before adding it\n",
    "# (presumably so the cell can be re-run in the same kernel - TODO confirm).\n",
    "t5.data.TaskRegistry.remove('news_dataset')\n",
    "# Register the TSV-backed task: news_dataset supplies the raw examples,\n",
    "# news_preprocessor maps them to inputs/targets, and `vocab` is the SentencePiece model path.\n",
    "t5.data.TaskRegistry.add(\n",
    "    'news_dataset',\n",
    "    dataset_fn = news_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [news_preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the task registered under 'news_dataset'.\n",
    "nq_task = t5.data.TaskRegistry.get(\"news_dataset\")\n",
    "# Request 'train', the only split declared at registration. news_dataset never reads\n",
    "# the split argument, so this matches the declared splits rather than selecting data.\n",
    "ds = nq_task.get_dataset(split='train', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "# Wrap the dataset so its elements come back as NumPy values.\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'tajuk: Ada Eks Orang Istana Jadi Direksi Jiwasraya, Ini Faktanya - Komisi VI DPR RI meminta anggota dewan direksi PT Asuransi Jiwasraya (Persero) periode 2013-2018 dicekal. Mereka dianggap yang paling bertanggung jawab atas kusutnya permasalahan pembayaran polis bancassurance nasabah.Mereka dinilai lalai dalam menjaga prinsip kehati-hatian saat menjual produk JS Saving Plan dalam kurun waktu 2014-2018 hingga mengakibatkan kerugian bagi perusahaan dan nasabah.Salah satu direksi Jiwasraya saat itu adalah Hari Prasetyo yang menjabat sebagai direktur keuangan. Hari juga ditunjuk Kepala Staf Kepresidenan Jenderal TNI (Purn) Moeldoko sebagai Tenaga Ahli Utama Kedeputian III Bidang Kajian dan Pengelolaan Isu-isu Ekonomi Strategis.Dia bergabung berbarengan dengan Ali Mochtar Ngabalin yang ditunjuk sebagai Tenaga Ahli Utama Kedeputian IV Bidang Komunikasi Politik dan Diseminasi Informasi.Namun, untuk periode sekarang, belum diketahui apakah Hari masih menjabat sebagai tenaga ahli utama di KSP. Demikian disampaikan Ngabalin kepada CNBC Indonesia , seperti dilansirRabu (18/12/2019).\"Sudah tidak adinda. Tapi saya belum tahu ya karena sampai sekarang belum ada pemilihan tenaga ahli. Baru ada pemilihan deputi,\" katanya.Uniknya, secara pribadi Ngabalin mengaku tidak mengenal secara personal Hari Prasetyo. Dia bahkan mengaku ingin mengetahui latar belakang mengapa sosok Hari ditanyakan.Sementara eks Tenaga Ahli Utama Kedeputian V KSP Ifdhal Kasim mengaku belum mengetahui apakah Hari akan dipanggil lagi sebagai tenaga ahli utama di lembaga itu.\"Saya belum tahu mas,\" ujarnya.Namun yang pasti, Ifdhal mengaku masih diminta menjabat sebagai tenaga ahli utama. Ia baru akan mulai bertugas pada Januari 2020 mendatang.',\n",
       " 'inputs': array([ 5749,    31,  2540, 14963,  1033,  8560,  2107,   208,    94,\n",
       "         6626,  6686,  8168,  8520,    14,   290, 15922,    38,    13,\n",
       "            7, 29656, 13360, 12700,    13,  3963,   715,   717,  3207,\n",
       "           24,    94,  6626,  8075,   414,  6124,   682,  6686,  8168,\n",
       "         8520,    13,     4,  2598,    16,  5418,     5,  1641,    81,\n",
       "          646,     7, 12729,    24, 17891,   159,     3,   466,  1928,\n",
       "           17,   279, 14729,  7081,   247,    13,  7651,  1089,    38,\n",
       "        21270,  2928,   612,  4783,  6054,  2960,  3066,    13,   476,\n",
       "          722,  3386,     3,  2385,  9533, 23022,    36,  2576,  6276,\n",
       "           55,  2570,     7,  2570,    47,  1263,  1654,  1096,   638,\n",
       "           75,    13, 21671,  4934,    36, 21591,   892,   444,     7,\n",
       "        12729,   268,  7010,  3824,   158,  6796,    22,    13,   476,\n",
       "          722,  3386,     3, 19744,   100,    24,    94,  6626,  6686,\n",
       "         8168,  8520,  1263,    37,    52,  1385,  5047,  5002,  3581,\n",
       "           17, 29346,    85,    24, 11002,  3301, 23525,     3,  1385,\n",
       "           93,    24,  5229,  6928, 16481,   534,  3502,   386, 25352,\n",
       "           47, 26799,  9085,   133,    13,     4, 11826,  6780,     5,\n",
       "         1275,   607,   837,  1232,    85,  5545,  2171, 11269,   386,\n",
       "          429,  4379,   974,  6494, 26594,  5710,    22,  7366,  1062,\n",
       "          472,    47,  6388,     7,  4490,  4651, 10385,    16,     3,\n",
       "         1414,  2846,   342,  1740,  2371,    47,    28,  1612,  1275,\n",
       "          856,  1905, 19425,  2552,   153,    17,    24,  5229,  6928,\n",
       "           85,  5545,  2171, 11269,   386,   429,  4379,   974, 13927,\n",
       "        26594, 10577, 10706,    22,   208,   478, 20307,   104,  4042,\n",
       "         1288,     3, 15087,    14,    25,  1641,    81,   420,    14,\n",
       "          742,  2898,  1516,  1385,   274, 29346,    85,  1340,   628,\n",
       "          351,    24,   560,  5876,     3, 28792,  9667, 19425,  2552,\n",
       "          153,    71,  3500,   398,    13,    14,   105,    24,  1796,\n",
       "         9381,  4626,  1628,    13,     4,  1910, 23108, 24561,     5,\n",
       "            3,     6, 12524,    30,  6976, 14421,     3, 12342,    67,\n",
       "          742,   350,  6311,  4074,  2303,   420,   742,    97,  2327,\n",
       "         1340,   628,     3,  2128,    97,  2327,   430,  4379,    91,\n",
       "           14,     6,   194,     3,  1172,  5763,    38,    14,   156,\n",
       "        23244, 19425,  2552,   153,  3857,    30,  4217,   156,  1937,\n",
       "         1385,  5047,  5002,  3581,     3,   160,  1305,  3857,   474,\n",
       "         1025,  4969,   941,  1438, 22770,  1385,  6137,   103,     3,\n",
       "           75, 11222,  3624,  9188,  5545,  2171, 11269,   386,   429,\n",
       "         4379,   974,  1224,   560,  5876,   574,    79,  3129,  1236,\n",
       "         5295,  3857,   742,  1025,  1516,  1385,    45,  4078,   221,\n",
       "           85,  1340,   628,   351,    24,  6829,    37,     3,     6,\n",
       "          254,   742,   350,    13,  3146,    14,     6, 17671,     3,\n",
       "        15087,    17,   789,    14,   574,    79,  3129,  3857,   274,\n",
       "         4773, 29346,    85,  1340,   628,   351,     3,   395,   143,\n",
       "           45,  1756,  6657,    33,  1247,  3229, 11296,     3,     1]),\n",
       " 'targets_plaintext': b'Pernah Ada Eks Orang Istana Jadi Direksi Jiwasraya',\n",
       " 'targets': array([20300,  2540, 14963,  1033,  8560,  2107,   208,    94,  6626,\n",
       "         6686,  8168,  8520,     1])}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pull one example from the iterator to inspect the plaintext and encoded inputs/targets.\n",
    "next(r)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
